import os
import sys

from opencc import OpenCC
from tqdm import tqdm
from zkl_datasets import ChunkedBinaryDatasetWriter, load_dataset
from zkl_promptui import confirm_clear_dir_path

# Make the repository root importable so the project-local `scripts.config`
# module resolves when this file is executed directly as a script.
root_dir_path = os.path.join(os.path.dirname(__file__), "../../..")
sys.path.append(root_dir_path)

# NOTE: this import must stay below the sys.path tweak above.
from scripts.config import datasets_dir_path

# Source: raw Chinese Wikipedia text dump; destination: cleaned output set.
src_dataset_path = os.path.join(datasets_dir_path, "wikipedia-text/zh/20231101")
dst_dataset_path = os.path.join(datasets_dir_path, "wikipedia-zh-cleaned")

# Ask before wiping any previous output, then open the source and the sink.
confirm_clear_dir_path(dst_dataset_path)
src_dataset = load_dataset(src_dataset_path)
writer = ChunkedBinaryDatasetWriter(dst_dataset_path)

# Convert Traditional -> Simplified Chinese, dropping very short entries.
converter = OpenCC('t2s')
for raw in tqdm(src_dataset):
    text = raw.decode("utf-8")
    if len(text) <= 64:  # skip entries that are too short
        continue
    simplified = converter.convert(text)
    writer.append(simplified.encode("utf-8"))
# NOTE(review): the writer is never explicitly closed/flushed here —
# presumably ChunkedBinaryDatasetWriter finalizes on its own; confirm.
